##
##
## This script calculates physician flows at the hospital-year level,
## weighted by patient charges and by the number of unique beneficiary IDs.
## 
## Note: to start tracking in 2004, raw data from 2003 is needed for the denominator
##
##
## The bash command is python3 wtd_transitions.py 2003 2014 1 
##
## 


import pandas as pd 
import sys
import csv 
import time 

# function that computes the hospital-level transitions for a single year y (vs. y-1)
def apply(d, y, vars, docid): 
	"""Compute hospital-level physician flows for year ``y`` relative to ``y-1``.

	Parameters
	----------
	d : DataFrame with at least the columns 'year', 'id', ``docid``,
	    'npatphys' and 'totchrg_phys'.
	y : the year being processed; rows from ``y`` and ``y-1`` are used.
	vars : passed through unchanged to ``gen_denominator``.
	docid : name of the physician identifier column in ``d``.

	Returns
	-------
	DataFrame with one row per ('id', 'year'), where 'year' is always ``y``.
	It holds the summed stayer/exit/entrant flags, the summed patient- and
	charge-weighted flows, and the mean hospital denominators.
	"""

	# restrict to the two years that define the transition
	window = d.loc[d['year'].isin([y, y-1]), :]

	# one labelled row (stayer / exit / entrant) per hospital-physician pair
	pair_rows = window.loc[:, [docid, 'year', 'id']]
	labels = pair_rows.groupby(['id', docid]).apply(gen_counts, y)
	labels = labels.reset_index(['id', docid])

	# attach the hospital denominators (averaged over y and y-1)
	hosp_den = gen_denominator(window, vars)
	flows = labels.merge(hosp_den, how = "inner", on = "id")

	# pick the year whose physician-level numerator is used:
	# exits take their values from y-1, everyone else from y
	flows['year'] = y
	flows.loc[flows["exits"].eq(True), "year"] = y-1
	phys_info = window[['id', docid, 'year', 'npatphys', 'totchrg_phys']]
	flows = flows.merge(phys_info, on = ['id', docid, "year"])

	# add the patient- and charge-weighted flow columns
	flows = gen_weights(flows)

	# report everything (including exits) under year y, then roll up to hospital
	flows["year"] = y
	summed = ['stayers', 'exits', 'entrants',
			  'wpat_entrants', 'wpat_exits', 'wpat_stayers',
			  'wcost_entrants', 'wcost_exits', 'wcost_stayers']
	averaged = ['den_npathosp', 'den_totchrg_hosp']
	how = dict.fromkeys(summed, 'sum')
	how.update(dict.fromkeys(averaged, 'mean'))

	hospital_level = flows.groupby(["id", "year"]).agg(how)
	return hospital_level.reset_index()



# generate weights for each physician by charges and patient counts
def gen_weights(out):
	"""Add patient- and charge-weighted flow columns to ``out`` in place.

	Each weight is the physician's value divided by the hospital denominator
	('npatphys' / 'den_npathosp' and 'totchrg_phys' / 'den_totchrg_hosp'),
	multiplied by the 'entrants', 'exits' and 'stayers' flags.

	Returns the same DataFrame object with six new columns:
	wpat_{entrants,exits,stayers} and wcost_{entrants,exits,stayers}.
	"""

	# (column prefix, physician share of the hospital total)
	shares = [
		("wpat", out["npatphys"] / out["den_npathosp"]),
		("wcost", out["totchrg_phys"] / out["den_totchrg_hosp"]),
	]

	for prefix, share in shares:
		for flow in ("entrants", "exits", "stayers"):
			out[prefix + "_" + flow] = out[flow] * share

	return out 


# for each hospital, compute the average total number of patients and 
# the average total physician charges 
def gen_denominator(temp, vars): 
	"""Build the per-hospital denominators used to weight physician flows.

	For each hospital 'id', 'npatphys' and 'totchrg_phys' are summed within
	each year and those yearly totals are then averaged across the years
	present in ``temp``.

	Parameters
	----------
	temp : DataFrame with columns 'id', 'year', 'npatphys', 'totchrg_phys'.
	vars : accepted for interface compatibility; not used by the body.

	Returns
	-------
	DataFrame with columns 'id', 'den_npathosp', 'den_totchrg_hosp'.
	"""

	def _mean_yearly_total(source_col, den_col):
		# hospital-year totals first, then the mean of those totals per hospital
		yearly_total = temp.groupby(['id', 'year'])[source_col].sum()
		per_hospital = yearly_total.reset_index().groupby('id')[source_col].mean()
		return per_hospital.rename(den_col).reset_index()

	patients = _mean_yearly_total('npatphys', 'den_npathosp')
	charges = _mean_yearly_total('totchrg_phys', 'den_totchrg_hosp')

	return patients.merge(charges, on = ['id'], how = 'inner')




# function to label docs as entrants/ stayers/ exiters
def gen_counts(g, y): 
	"""Label one physician-hospital pair as stayer, exit, or entrant.

	Parameters
	----------
	g : rows belonging to a single (hospital, physician) group; must have a
	    'year' column.
	y : the current year; the pair is compared between ``y-1`` and ``y``.

	Returns
	-------
	One-row DataFrame with boolean columns 'stayers', 'exits', 'entrants':
	  * stayer  - the pair appears in both ``y-1`` and ``y``
	  * exit    - the pair appears in ``y-1`` but not in ``y``
	  * entrant - the pair appears in ``y`` but not in ``y-1``
	An empty group yields an empty frame with the same three columns.

	The label is derived from which years are present rather than from the
	number of rows. Duplicate rows for a pair-year therefore cannot turn a
	pair into a false stayer or make it drop out of the result.
	"""

	if g.shape[0] == 0:
		return pd.DataFrame({'stayers': [], 'exits': [], 'entrants': []})

	years = g['year']
	in_prev = bool((years == y - 1).any())
	in_curr = bool((years == y).any())

	return pd.DataFrame({'stayers': [in_prev and in_curr],
						 'exits': [in_prev and not in_curr],
						 'entrants': [in_curr and not in_prev]})




####################################################################################
# Implementation
####################################################################################
start_time = time.time()

# root directory of the physician-flows project
dataRoot = "/disk/agedisk4/medicare.work/sacarny-DUA51934/shruthi-dua51934/physician_flows"

# use the alternate id concept? (third command-line argument)
# 1 -> file names carry the "altid_" tag, 0 -> no tag
alt = int(sys.argv[3])
altid = alt * "altid_"

# read the provider file 
pct = "100"
d = pd.read_csv("{}/data/combined/providers_{}match_pct{}_20230606.csv".format(dataRoot, altid, pct))

# first and last year to process (first and second command-line arguments)
y_start = int(sys.argv[1]) 
y_end = int(sys.argv[2])

# doc id concept being used (npi or synthetic docid)
docid = "docid"

# calculate transitions for each year from y_start through y_end (inclusive)
# remember, do not start in the first year of the data: year y-1 is needed.
# NOTE(review): the loop begins at y_start itself, so the data must contain
# y_start - 1 for the first iteration to be meaningful - confirm against the
# command shown in the file header.
for y in range(y_start, y_end + 1):

	print("Processing transitons for {}".format(y))

	vars = ['npathosp', 'totchrg_hosp']
	t = apply(d, y, vars, docid)

	# write one csv per year
	out_path = "{}/data/transitions/transitions_{}wtd{}pct{}_20230606.csv".format(dataRoot, altid, y, pct)
	t.to_csv(out_path, index = False)

	print("Finished writing {} weighted transitions for {}".format(altid, y))


# report the total run time in hours
end_time = time.time()
elapsed_hours = round((end_time - start_time) / 3600, 5)
print("Elapsed time: {} hours".format(elapsed_hours))






